/* START 2_genCardSample.do */

/* this script compiles a "long panel" of 5% of card IDs. */

/* Session setup: disable output paging and start from empty memory
   (capture: ignore any error from clear). */
set more off
capture clear

/* PATHS */
quietly do code/config.do

/* Initialize 12 threads (currently disabled) */
// parallel initialize 12

/* Load accessory functions */
quietly do "./code/99_accessoryFunctions.do"

// local panel_flag 1 /* distinguish between panel and 2015 data */
/* Stack the raw extracts into one file per source:
     panel_flag == 0 -> make_data/2015.dta  (Dec 2015 - Mar 2016 extracts)
     panel_flag == 1 -> make_data/panel.dta (first-week quarterly extracts)
   The pass with panel_flag == 1 runs last, so the panel data is what
   remains in memory after the loop. */
foreach panel_flag in 0 1 {
	/* Start every pass from an empty dataset. Without this, the rows
	   appended during the panel_flag == 0 pass would still be in memory
	   and would be saved into panel.dta as well. */
	clear

	if `panel_flag' == 1 {
		local fileList 1_7Dec2016 1_7Dec2017 1_7Jun2015 1_7Jun2016 1_7Jun2017 ///
		  1_7Jun2018 1_7Mar2016 1_7Mar2017 1_7Mar2018 1_7Sep2015 1_7Sep2016 1_7Sep2017
		global data_site ${data_panel}
	}
	else {
		local fileList 1dec2015_10dec2015 11dec2015_20dec2015 11feb2016_20feb2016 ///
		  11jan2016_20jan2016 1feb2016_10feb2016 1jan2016_10jan2016 21dec2015_31dec2015 ///
		  21feb2016_1mar2016 21jan2016_31jan2016
		global data_site ${data_local}
	}

	/* Keep only observations that match card numbers in sample */
	foreach name in `fileList' {
		/* Earlier approach, kept for reference (`i' was a per-file counter): */
		/* use "${data_local}/sampleIDs.dta", clear */
		/* merge m:m CRD_NUM using "${data_local}/`name'.dta", keep(3) */
		/* drop _merge */
		/* save "${data_local}/_`i'.dta", replace */

		append using "make_data/`name'.dta"
		keep if CRD_NUM > 1e14 /* kill temporary cards */
	}

	hashsort CRD_NUM START
	compress

	if `panel_flag' == 1 {
		save "make_data/panel.dta", replace
	}
	else {
		save "make_data/2015.dta", replace
	}
}


/* Remove a leftover intermediate file if present (errors are ignored) */
cap rm "make_data/_1.dta"

/* Run same data cleaning analysis as in `1_genLocPairs.do` */

local panel_flag 1

/* Reload the stacked file from disk only when no card-level data
   (variable CRD_NUM) is currently in memory. */
capture confirm variable CRD_NUM
if _rc != 0 {
	local stacked = cond(`panel_flag' == 1, "make_data/panel.dta", "make_data/2015.dta")
	use "`stacked'", clear
}

/* check number of cards we can see across 3 years */
// gen y = year(dofc(START))
// preserve
// gcollapse (max) y_max = y (min) y_min = y, by(CRD_NUM)
// gen dy = y_max - y_min
// tab dy
// restore

/*          dy |      Freq.     Percent        Cum. */
/* ------------+----------------------------------- */
/*           0 |  6,064,398       45.58       45.58 */
/*           1 |  3,995,017       30.03       75.61 */
/*           2 |  2,021,995       15.20       90.81 */
/*           3 |  1,222,401        9.19      100.00 */
/* ------------+----------------------------------- */
/*       Total | 13,303,811      100.00             */


/* Trip duration in minutes; keep trips lasting between 0 and 120 minutes
   (rows with missing duration are dropped as well). */
generate t = minutes(END - START)
keep if inrange(t, 0, 120)

/* PATRON_CATG_ID_NUM may not exist in every file, hence the capture */
capture drop if PATRON_CATG_ID_NUM == 0

/* Drop rows with neither an origin nor a destination ID; where only one
   end is missing, blank the matching timestamp and the duration. */
drop if JRNY_ORIG_ID_NUM == . & JRNY_DEST_ID_NUM == .
replace END = . if JRNY_DEST_ID_NUM == .
replace START = . if JRNY_ORIG_ID_NUM == .
replace t = . if START == . | END == .

/* `mergeSubzones` is very slow when run on full data set. Instead,
copy pairs to separate frame, drop duplicates, merge subzones onto
origin-destination pairs, then copy them back.

The result is cached in make_data/ODPairs.dta: it is built only when that
file is not found (confirm file returns 601) and loaded from disk otherwise. */

/* `frame put, into()` and `frame create` both fail if the frame already
   exists (e.g. when this script is re-run in the same session), so drop
   any stale copy first. Must come before the file check because capture
   resets _rc. */
capture frame drop pairs

capture confirm file "make_data/ODPairs.dta"
if _rc == 601 {
	frame put JRNY_ORIG_ID_NUM JRNY_DEST_ID_NUM, into(pairs)

	/* Block form of the frame prefix: every command below runs in `pairs` */
	frame pairs {
		gduplicates drop
		hashsort JRNY_ORIG_ID_NUM JRNY_DEST_ID_NUM

		/* NOTE(review): mergeSubzones comes from the accessory functions;
		   it is expected to add orig_area/orig_pl and dest_area/dest_pl
		   - confirm against 99_accessoryFunctions.do */
		mergeSubzones orig
		mergeSubzones dest
		hashsort JRNY_ORIG_ID_NUM JRNY_DEST_ID_NUM

		/* Encode the string variables into short numeric ones:
		   orig_area -> oa, dest_area -> da, orig_pl -> op, dest_pl -> dp
		   (first character + character 6, i.e. the one after "_"). */
		foreach name in orig_area dest_area orig_pl dest_pl {
			local name2 = substr("`name'", 1, 1) + substr("`name'", 6, 1)
			encode `name', gen(`name2')
			drop `name'
		}

		compress
		save "make_data/ODPairs.dta", replace
	}
}
else {
	frame create pairs
	frame pairs: use "make_data/ODPairs.dta", clear
}

/* Link each trip to its origin-destination pair in frame `pairs` (the link
   variable is named `pairs` by default), then copy the encoded area (oa, da)
   and planning-area (op, dp) codes into the main data. */
frlink m:1 JRNY_ORIG_ID_NUM JRNY_DEST_ID_NUM, frame(pairs)
save, replace
frget oa da op dp, from(pairs)
save, replace
/* TODO: to be filled in */

/* Assumes `linkTransitStops` has been run in `1_genLocPairs.do` */
/* mergeSubzones orig */
/* mergeSubzones dest */

/* oa/da are produced by `encode`, so they are numeric: comparing them with
   "" is a type mismatch. missing() is valid for numeric and string variables
   and is true for rows that have no area code. */
drop if missing(oa)
drop if missing(da) & END != . /* keep even if they forgot to tap out */

/* label by region */
/* or / dr hold the region (C, W, NE, E, N) of the origin / destination,
   looked up from the encoded planning-area codes op / dp.
   NOTE(review): the numeric codes are whatever `encode` assigned when
   ODPairs.dta was built - confirm the lists still match if that file is
   regenerated from different data. */
gen or = ""
gen dr = ""

local codes_C 3 6 8 13 14 18 21 22 23 24 25 26 27 32 33 34 39 40 41 44 46
local codes_W 48 47 45 30 17 16 12 11 7 5 4
local codes_NE 1 15 31 35 37 38
local codes_E 2 10 28 29 43
local codes_N 9 19 20 36 42 49 50

foreach region in C W NE E N {
	/* turn the space-separated code list into inlist() arguments */
	local csv : subinstr local codes_`region' " " ",", all
	replace or = "`region'" if inlist(op, `csv')
	replace dr = "`region'" if inlist(dp, `csv')
}

save, replace
